Importing Dataset and Data Preprocessing

In [1]:
# Import packages and load dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import math
import os
import re
import time

master_tm = time.time()
%matplotlib inline

import warnings
warnings.simplefilter('ignore')

from IPython.core.display import display, HTML
from IPython.display import Image
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from matplotlib import cm

from statsmodels.formula.api import ols
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz

from sklearn.cluster import KMeans

try:
    import pydot
except:
    !pip install pydot
    import pydot

df = pd.read_excel('Bike_Sharing2.xlsx')
df
Out[1]:
S_No index Time Date season yr mnth hr holiday weekday workingday weathersit temp atemp hum windspeed casual registered
0 1 1.0 2011-01-01 00:00:00 2011-01-01 1 0 1 0 0 6 0 1.0 0.24 0.2879 0.81 0.0000 3.0 13.0
1 2 2.0 2011-01-01 01:00:00 2011-01-01 1 0 1 1 0 6 0 1.0 0.22 0.2727 0.80 0.0000 8.0 32.0
2 3 3.0 2011-01-01 02:00:00 2011-01-01 1 0 1 2 0 6 0 1.0 0.22 0.2727 0.80 0.0000 5.0 27.0
3 4 4.0 2011-01-01 03:00:00 2011-01-01 1 0 1 3 0 6 0 1.0 0.24 0.2879 0.75 0.0000 3.0 10.0
4 5 5.0 2011-01-01 04:00:00 2011-01-01 1 0 1 4 0 6 0 1.0 0.24 0.2879 0.75 0.0000 0.0 1.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
17539 17540 17375.0 2012-12-31 19:00:00 2012-12-31 1 1 12 19 0 1 1 2.0 0.26 0.2576 0.60 0.1642 11.0 108.0
17540 17541 17376.0 2012-12-31 20:00:00 2012-12-31 1 1 12 20 0 1 1 2.0 0.26 0.2576 0.60 0.1642 8.0 81.0
17541 17542 17377.0 2012-12-31 21:00:00 2012-12-31 1 1 12 21 0 1 1 1.0 0.26 0.2576 0.60 0.1642 7.0 83.0
17542 17543 17378.0 2012-12-31 22:00:00 2012-12-31 1 1 12 22 0 1 1 1.0 0.26 0.2727 0.56 0.1343 13.0 48.0
17543 17544 17379.0 2012-12-31 23:00:00 2012-12-31 1 1 12 23 0 1 1 1.0 0.26 0.2727 0.65 0.1343 12.0 37.0

17544 rows × 18 columns

In [2]:
print(df.info())
df.describe()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17544 entries, 0 to 17543
Data columns (total 18 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   S_No        17544 non-null  int64         
 1   index       17379 non-null  float64       
 2   Time        17544 non-null  datetime64[ns]
 3   Date        17379 non-null  datetime64[ns]
 4   season      17544 non-null  int64         
 5   yr          17544 non-null  int64         
 6   mnth        17544 non-null  int64         
 7   hr          17544 non-null  int64         
 8   holiday     17544 non-null  int64         
 9   weekday     17544 non-null  int64         
 10  workingday  17544 non-null  int64         
 11  weathersit  17379 non-null  float64       
 12  temp        17379 non-null  float64       
 13  atemp       17379 non-null  float64       
 14  hum         17379 non-null  float64       
 15  windspeed   17379 non-null  float64       
 16  casual      17379 non-null  float64       
 17  registered  17379 non-null  float64       
dtypes: datetime64[ns](2), float64(8), int64(8)
memory usage: 2.4 MB
None
Out[2]:
S_No index season yr mnth hr holiday weekday workingday weathersit temp atemp hum windspeed casual registered
count 17544.000000 17379.0000 17544.000000 17544.000000 17544.000000 17544.000000 17544.000000 17544.000000 17544.000000 17379.000000 17379.000000 17379.000000 17379.000000 17379.000000 17379.000000 17379.000000
mean 8772.500000 8690.0000 2.496580 0.500684 6.519836 11.500000 0.028728 2.997264 0.683995 1.425283 0.496987 0.475775 0.627229 0.190098 35.676218 153.786869
std 5064.660897 5017.0295 1.110079 0.500014 3.449649 6.922384 0.167045 2.003472 0.464928 0.639357 0.192556 0.171850 0.192930 0.122340 49.305030 151.357286
min 1.000000 1.0000 1.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.020000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 4386.750000 4345.5000 2.000000 0.000000 4.000000 5.750000 0.000000 1.000000 0.000000 1.000000 0.340000 0.333300 0.480000 0.104500 4.000000 34.000000
50% 8772.500000 8690.0000 3.000000 1.000000 7.000000 11.500000 0.000000 3.000000 1.000000 1.000000 0.500000 0.484800 0.630000 0.194000 17.000000 115.000000
75% 13158.250000 13034.5000 3.000000 1.000000 10.000000 17.250000 0.000000 5.000000 1.000000 2.000000 0.660000 0.621200 0.780000 0.253700 48.000000 220.000000
max 17544.000000 17379.0000 4.000000 1.000000 12.000000 23.000000 1.000000 6.000000 1.000000 4.000000 1.000000 1.000000 1.000000 0.850700 367.000000 886.000000
In [3]:
## Data Cleaning
df.drop(columns=['S_No','index','Date'],inplace=True)

# Rename column names for readability
df.rename(columns= {'Time':'time',
                    'yr':'year',
                    'mnth':'month',
                    'hr':'hour',
                    'holiday':'is_holiday',
                    'weekday':'day_of_week',
                    'workingday':'is_working_day',
                    'weathersit':'weather_type',
                    'atemp':'app_temp',
                    'hum':'humidity',
                    'windspeed':'wind_speed'}, inplace=True)

# Formatting and extracting datetime data
df.insert(4, 'day', df['time'].dt.day)
print(df.isna().sum())

df_master = df.copy()
df_master
time                0
season              0
year                0
month               0
day                 0
hour                0
is_holiday          0
day_of_week         0
is_working_day      0
weather_type      165
temp              165
app_temp          165
humidity          165
wind_speed        165
casual            165
registered        165
dtype: int64
Out[3]:
time season year month day hour is_holiday day_of_week is_working_day weather_type temp app_temp humidity wind_speed casual registered
0 2011-01-01 00:00:00 1 0 1 1 0 0 6 0 1.0 0.24 0.2879 0.81 0.0000 3.0 13.0
1 2011-01-01 01:00:00 1 0 1 1 1 0 6 0 1.0 0.22 0.2727 0.80 0.0000 8.0 32.0
2 2011-01-01 02:00:00 1 0 1 1 2 0 6 0 1.0 0.22 0.2727 0.80 0.0000 5.0 27.0
3 2011-01-01 03:00:00 1 0 1 1 3 0 6 0 1.0 0.24 0.2879 0.75 0.0000 3.0 10.0
4 2011-01-01 04:00:00 1 0 1 1 4 0 6 0 1.0 0.24 0.2879 0.75 0.0000 0.0 1.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
17539 2012-12-31 19:00:00 1 1 12 31 19 0 1 1 2.0 0.26 0.2576 0.60 0.1642 11.0 108.0
17540 2012-12-31 20:00:00 1 1 12 31 20 0 1 1 2.0 0.26 0.2576 0.60 0.1642 8.0 81.0
17541 2012-12-31 21:00:00 1 1 12 31 21 0 1 1 1.0 0.26 0.2576 0.60 0.1642 7.0 83.0
17542 2012-12-31 22:00:00 1 1 12 31 22 0 1 1 1.0 0.26 0.2727 0.56 0.1343 13.0 48.0
17543 2012-12-31 23:00:00 1 1 12 31 23 0 1 1 1.0 0.26 0.2727 0.65 0.1343 12.0 37.0

17544 rows × 16 columns

Regression Analysis

Regression Data Preparation

In [4]:
# Further Data Cleaning
df_regression = df.copy()
df_regression.dropna(axis=0, how='any', inplace=True)
df_regression.reset_index(drop=True, inplace=True)
df_regression['total_users'] = df_regression['casual'] + df_regression['registered']
df_regression.drop(columns=['casual',
                            'registered'], inplace=True)

df_regression['weather_type'] = df_regression['weather_type'].astype('int64')

print(df_regression.isna().sum())
df_regression
time              0
season            0
year              0
month             0
day               0
hour              0
is_holiday        0
day_of_week       0
is_working_day    0
weather_type      0
temp              0
app_temp          0
humidity          0
wind_speed        0
total_users       0
dtype: int64
Out[4]:
time season year month day hour is_holiday day_of_week is_working_day weather_type temp app_temp humidity wind_speed total_users
0 2011-01-01 00:00:00 1 0 1 1 0 0 6 0 1 0.24 0.2879 0.81 0.0000 16.0
1 2011-01-01 01:00:00 1 0 1 1 1 0 6 0 1 0.22 0.2727 0.80 0.0000 40.0
2 2011-01-01 02:00:00 1 0 1 1 2 0 6 0 1 0.22 0.2727 0.80 0.0000 32.0
3 2011-01-01 03:00:00 1 0 1 1 3 0 6 0 1 0.24 0.2879 0.75 0.0000 13.0
4 2011-01-01 04:00:00 1 0 1 1 4 0 6 0 1 0.24 0.2879 0.75 0.0000 1.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
17374 2012-12-31 19:00:00 1 1 12 31 19 0 1 1 2 0.26 0.2576 0.60 0.1642 119.0
17375 2012-12-31 20:00:00 1 1 12 31 20 0 1 1 2 0.26 0.2576 0.60 0.1642 89.0
17376 2012-12-31 21:00:00 1 1 12 31 21 0 1 1 1 0.26 0.2576 0.60 0.1642 90.0
17377 2012-12-31 22:00:00 1 1 12 31 22 0 1 1 1 0.26 0.2727 0.56 0.1343 61.0
17378 2012-12-31 23:00:00 1 1 12 31 23 0 1 1 1 0.26 0.2727 0.65 0.1343 49.0

17379 rows × 15 columns

Data Visualization

Univariate Analysis

In [5]:
# Univariate Analysis - Understanding distribution and shape of each variable
id_vars = ['time']
num_vars = list(df_regression.columns)[-5:]
cat_vars = list(df_regression.columns)[1:-5]

for col in list(df_regression.columns)[1:]:
    f, axes = plt.subplots(1, 2, figsize=(12, 4))
    
    if col in cat_vars:
        axes[0].bar(x=list(df_regression[col].value_counts().index), 
                    height=list(df_regression[col].value_counts().values))
        if col in ['season','year','week','is_holiday','is_working_day','weather_type']:
            y_min = int(math.floor(np.min(df_regression[col].value_counts())/500))*500
            y_max = int(math.ceil(np.max(df_regression[col].value_counts())/500))*500
            axes[0].set_ylim([y_min, y_max])
        elif col in ['month','day','hour','day_of_week']:
            y_min = int(math.floor(np.min(df_regression[col].value_counts())/50))*50
            y_max = int(math.ceil(np.max(df_regression[col].value_counts())/50))*50
            axes[0].set_ylim([y_min, y_max])
    else:
        df_regression[col].hist(ax = axes[0], grid=False)
    
    axes[0].set_title('Distribution of \''+ col + '\'')
    
    df_regression.boxplot(column = col, ax = axes[1], grid=False)
    plt.tight_layout()
    plt.show()

Bivariate Analysis

In [6]:
# Bi-Variate Analysis - Understanding pairwise relationships between variables
start = time.time()
sns.pairplot(df_regression.iloc[:,1:])
print('Runtime:',time.time()-start,'seconds')
Runtime: 39.42766332626343 seconds
In [7]:
# Creating categorical dummy variables
quantitative_data = df_regression[num_vars].copy()
categorical_data = df_regression[cat_vars].copy()
categorical_data.astype('category')

dummy = pd.get_dummies(categorical_data, columns=list(categorical_data.columns))
df_regression = pd.concat([quantitative_data,dummy],axis=1)
df_regression = pd.concat([df_regression.pop('total_users'),df_regression], axis=1)
df_regression
Out[7]:
total_users temp app_temp humidity wind_speed season_1 season_2 season_3 season_4 year_0 ... day_of_week_3 day_of_week_4 day_of_week_5 day_of_week_6 is_working_day_0 is_working_day_1 weather_type_1 weather_type_2 weather_type_3 weather_type_4
0 16.0 0.24 0.2879 0.81 0.0000 1 0 0 0 1 ... 0 0 0 1 1 0 1 0 0 0
1 40.0 0.22 0.2727 0.80 0.0000 1 0 0 0 1 ... 0 0 0 1 1 0 1 0 0 0
2 32.0 0.22 0.2727 0.80 0.0000 1 0 0 0 1 ... 0 0 0 1 1 0 1 0 0 0
3 13.0 0.24 0.2879 0.75 0.0000 1 0 0 0 1 ... 0 0 0 1 1 0 1 0 0 0
4 1.0 0.24 0.2879 0.75 0.0000 1 0 0 0 1 ... 0 0 0 1 1 0 1 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
17374 119.0 0.26 0.2576 0.60 0.1642 1 0 0 0 0 ... 0 0 0 0 0 1 0 1 0 0
17375 89.0 0.26 0.2576 0.60 0.1642 1 0 0 0 0 ... 0 0 0 0 0 1 0 1 0 0
17376 90.0 0.26 0.2576 0.60 0.1642 1 0 0 0 0 ... 0 0 0 0 0 1 1 0 0 0
17377 61.0 0.26 0.2727 0.56 0.1343 1 0 0 0 0 ... 0 0 0 0 0 1 1 0 0 0
17378 49.0 0.26 0.2727 0.65 0.1343 1 0 0 0 0 ... 0 0 0 0 0 1 1 0 0 0

17379 rows × 93 columns

Correlation Matrix (Heatmap)

In [8]:
# Correlation Matrix (Heatmap)
corr = df_regression.iloc[:,:5].corr(method='pearson')

plt.figure(figsize=(15, 15))
sns.heatmap(corr, annot=True, cmap="YlGnBu",cbar_kws={'label': 'Correlation'})
plt.tight_layout()
plt.xticks(rotation=45, horizontalalignment='right')
plt.yticks(rotation=0)

plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
In [9]:
# Dropping highly correlated apparent temperature variable
df_regression.drop(columns=['app_temp'], inplace=True)
df_regression
Out[9]:
total_users temp humidity wind_speed season_1 season_2 season_3 season_4 year_0 year_1 ... day_of_week_3 day_of_week_4 day_of_week_5 day_of_week_6 is_working_day_0 is_working_day_1 weather_type_1 weather_type_2 weather_type_3 weather_type_4
0 16.0 0.24 0.81 0.0000 1 0 0 0 1 0 ... 0 0 0 1 1 0 1 0 0 0
1 40.0 0.22 0.80 0.0000 1 0 0 0 1 0 ... 0 0 0 1 1 0 1 0 0 0
2 32.0 0.22 0.80 0.0000 1 0 0 0 1 0 ... 0 0 0 1 1 0 1 0 0 0
3 13.0 0.24 0.75 0.0000 1 0 0 0 1 0 ... 0 0 0 1 1 0 1 0 0 0
4 1.0 0.24 0.75 0.0000 1 0 0 0 1 0 ... 0 0 0 1 1 0 1 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
17374 119.0 0.26 0.60 0.1642 1 0 0 0 0 1 ... 0 0 0 0 0 1 0 1 0 0
17375 89.0 0.26 0.60 0.1642 1 0 0 0 0 1 ... 0 0 0 0 0 1 0 1 0 0
17376 90.0 0.26 0.60 0.1642 1 0 0 0 0 1 ... 0 0 0 0 0 1 1 0 0 0
17377 61.0 0.26 0.56 0.1343 1 0 0 0 0 1 ... 0 0 0 0 0 1 1 0 0 0
17378 49.0 0.26 0.65 0.1343 1 0 0 0 0 1 ... 0 0 0 0 0 1 1 0 0 0

17379 rows × 92 columns

Linear Regression (with auto feature selection)

In [10]:
df = df_regression.copy()
y = list(df.columns)[0]

rename_columns = {}

for column_name in list(df.columns):
    ## Go through each column name and check if it contains non-alphanumeric
    if re.search('\W', column_name):
        ## Remove non-alphanumeric at the start or end of the column names or replace with '_' if in the middle
        new_name = re.sub('^\W+', '', column_name)
        new_name = re.sub('\W+$', '', new_name)
        rename_columns.update({column_name: re.sub('\W+', '_', new_name)})

df.rename(columns=rename_columns,
          inplace=True)

modeleq = ' + '.join(list(df.columns)).replace('+', '~', 1)
print('\nModel equation:', modeleq, '\n')

maxR2 = -np.inf
bmodeleq = modeleq
numx = df.shape[1] - 1
x1x2 = False #interaction variables not yet included
df2 = df.copy()

#print(ols(modeleq, df).fit().summary2())

while True:
    regout = ols(modeleq, df).fit()
    R2 = regout.rsquared_adj
    
    if R2 > maxR2:
        maxR2 = R2
        bmodeleq = modeleq

    print('\nAdjusted R2 =', R2, 'for', numx, 'Xs.')
   
    if numx == 1:
        print('Variable left:', modeleq[modeleq.find('~') + 2 :])
        if x1x2:
            #one xvar left
            #get out of 'while' loop:
            break
            
        else:
            #add interaction variables for original untransformed variables in best model so far
            
            numx = bmodeleq.count('+') + 1
            print('\nRestarting from best model (with', numx, 'Xs) found so far...')
            
            colname = bmodeleq.replace('~', '+').split(' + ')
            df = df2[colname]
            colname = colname[1:] #remove y
            
                    
            df2 = df.copy()
                    
            #delete any x too highly correlated with another x, to avoid collinearity
            
            corv = pd.DataFrame() #start empty dataframe for corr(Xs, y) to come
            for x in list(df)[1:]:
                #during 1st time thru loop: new column, with label, created in empty dataframe:
                #during subsequent time thru loop: new row, with row label, added to dataframe:
                corv.loc[x, y] = df[x].corr(df[list(df)[0]])
                
            corv = corv.loc[abs(corv).sort_values([y]).index, :] #corr(Xs, y) ranked

            delta = 0.005 #corr difference lower limit
            dl2 = []
            icorr = True
            while icorr:
                a = abs(corv).diff() <= delta #adjacent rows with similar abs(corr(Xs, y))
                colname = list(df)[1:]
                dl = []
                print('\nX pairs with correlations >', 1 - delta, ':')
                for b in range(1, a.shape[0]):
                    if a.iloc[b, 0]:
                        if abs(df[a.index[b - 1]].corr(df[a.index[b]])) > 1 - delta:
                            #deleting 1 X from correlated pair:
                            dv0 = a.index[b - 1]
                            dv1 = a.index[b]

                            #neither should already be deleted:
                            if not (dv0 in dl) and not (dv1 in dl):
                                #delete x with rather lower corr(x, y):
                                if abs(corv.loc[dv0, y]) - abs(corv.loc[dv1, y]) >= delta:
                                    d = dv1
                                elif len(dv0) < len(dv1): #delete x with longer name:
                                    d = dv1
                                else:
                                    d = dv0
                                    
                                dl.append(d) #for en masse deletion later
                                corv.drop([d], axis=0, inplace=True) #delete from column of corr with y

                                print(dv0,',',dv1)
            
                if len(dl) > 0:
                    df.drop(axis=1, columns=dl, inplace=True) #variables deleted en masse
                    dl2 = dl2 + dl #keep for real deletion later
                    print('\n' + str(len(dl)), 'variables considered for deletion:')
                    print('\n'.join([str(x) for x in dl]))
                else:
                    print('(no more)')
                    icorr = False
                    
            dl2 = [x for x in dl2 if x.find('_x_') != -1] #only interaction variables kept
            df2.drop(axis=1, columns=dl2, inplace=True) #collinear interaction variables deleted en masse, for real
            #remaining Xs may be collinear
            print('\n' + str(len(dl2)) + ' interaction variables deleted.')
            
            #potential collinearity issues handled
            
            
            modeleq = ' + '.join(list(df2)).replace('+', '~', 1)
            numx = df2.shape[1] - 1
            x1x2 = True #interaction variables already included
            
            #beyond-pairwise collinearity may still be introduced with the interaction variables
            
            df = df2.copy() #ready for continuing deletion
            continue

    #identify X variable to delete by finding the one with smallest abs(t-stat):
    t = regout.tvalues[1:]
    xdrop = list(t[abs(t) == min(abs(t))].index)[0]
    print('Variable to drop:', xdrop)
    
    df.drop(xdrop, axis=1, inplace=True)
    modeleq = ' + '.join(list(df)).replace('+', '~', 1)
    
    numx = numx - 1

numx = bmodeleq.count('+') + 1
print('\nBest model has', numx, 'Xs:')

out = ols(bmodeleq, df2).fit()
print(out.summary2())
Model equation: total_users ~ temp + humidity + wind_speed + season_1 + season_2 + season_3 + season_4 + year_0 + year_1 + month_1 + month_2 + month_3 + month_4 + month_5 + month_6 + month_7 + month_8 + month_9 + month_10 + month_11 + month_12 + day_1 + day_2 + day_3 + day_4 + day_5 + day_6 + day_7 + day_8 + day_9 + day_10 + day_11 + day_12 + day_13 + day_14 + day_15 + day_16 + day_17 + day_18 + day_19 + day_20 + day_21 + day_22 + day_23 + day_24 + day_25 + day_26 + day_27 + day_28 + day_29 + day_30 + day_31 + hour_0 + hour_1 + hour_2 + hour_3 + hour_4 + hour_5 + hour_6 + hour_7 + hour_8 + hour_9 + hour_10 + hour_11 + hour_12 + hour_13 + hour_14 + hour_15 + hour_16 + hour_17 + hour_18 + hour_19 + hour_20 + hour_21 + hour_22 + hour_23 + is_holiday_0 + is_holiday_1 + day_of_week_0 + day_of_week_1 + day_of_week_2 + day_of_week_3 + day_of_week_4 + day_of_week_5 + day_of_week_6 + is_working_day_0 + is_working_day_1 + weather_type_1 + weather_type_2 + weather_type_3 + weather_type_4 


Adjusted R2 = 0.6857802618443614 for 91 Xs.
Variable to drop: hour_4

Adjusted R2 = 0.6857753647419713 for 90 Xs.
Variable to drop: hour_3

Adjusted R2 = 0.6857935443437535 for 89 Xs.
Variable to drop: day_29

Adjusted R2 = 0.6857887667921927 for 88 Xs.
Variable to drop: day_28

Adjusted R2 = 0.6857978408297494 for 87 Xs.
Variable to drop: day_27

Adjusted R2 = 0.6858185981551397 for 86 Xs.
Variable to drop: day_1

Adjusted R2 = 0.6858372052929022 for 85 Xs.
Variable to drop: day_24

Adjusted R2 = 0.6858577764041636 for 84 Xs.
Variable to drop: day_25

Adjusted R2 = 0.6858685396832572 for 83 Xs.
Variable to drop: day_22

Adjusted R2 = 0.6858823889309964 for 82 Xs.
Variable to drop: day_8

Adjusted R2 = 0.6859017597018506 for 81 Xs.
Variable to drop: day_7

Adjusted R2 = 0.6859085836013283 for 80 Xs.
Variable to drop: month_9

Adjusted R2 = 0.685911169903872 for 79 Xs.
Variable to drop: weather_type_4

Adjusted R2 = 0.6859093574634461 for 78 Xs.
Variable to drop: weather_type_3

Adjusted R2 = 0.6859296868526099 for 77 Xs.
Variable to drop: season_1

Adjusted R2 = 0.6859068282704559 for 76 Xs.
Variable to drop: day_26

Adjusted R2 = 0.685943187392246 for 75 Xs.
Variable to drop: day_18

Adjusted R2 = 0.685933552182301 for 74 Xs.
Variable to drop: is_working_day_0

Adjusted R2 = 0.6859243754795237 for 73 Xs.
Variable to drop: year_1

Adjusted R2 = 0.685926157055642 for 72 Xs.
Variable to drop: day_of_week_0

Adjusted R2 = 0.6859257531509151 for 71 Xs.
Variable to drop: day_of_week_5

Adjusted R2 = 0.6859172330264544 for 70 Xs.
Variable to drop: is_holiday_0

Adjusted R2 = 0.6859394778348251 for 69 Xs.
Variable to drop: day_2

Adjusted R2 = 0.6859363613974276 for 68 Xs.
Variable to drop: day_30

Adjusted R2 = 0.6859333581192022 for 67 Xs.
Variable to drop: day_of_week_3

Adjusted R2 = 0.6859259451407438 for 66 Xs.
Variable to drop: day_of_week_4

Adjusted R2 = 0.6859290993719822 for 65 Xs.
Variable to drop: day_23

Adjusted R2 = 0.6859115206057089 for 64 Xs.
Variable to drop: day_5

Adjusted R2 = 0.6858917016908037 for 63 Xs.
Variable to drop: day_19

Adjusted R2 = 0.6858718667860461 for 62 Xs.
Variable to drop: day_31

Adjusted R2 = 0.685850097012868 for 61 Xs.
Variable to drop: day_14

Adjusted R2 = 0.6858228107518651 for 60 Xs.
Variable to drop: day_12

Adjusted R2 = 0.6857972892478064 for 59 Xs.
Variable to drop: day_3

Adjusted R2 = 0.685773165085928 for 58 Xs.
Variable to drop: day_10

Adjusted R2 = 0.6857535190994006 for 57 Xs.
Variable to drop: day_13

Adjusted R2 = 0.6857355903086282 for 56 Xs.
Variable to drop: day_9

Adjusted R2 = 0.6857147573783373 for 55 Xs.
Variable to drop: day_of_week_2

Adjusted R2 = 0.6856858128101659 for 54 Xs.
Variable to drop: day_of_week_1

Adjusted R2 = 0.6856613197619186 for 53 Xs.
Variable to drop: day_21

Adjusted R2 = 0.6856177942973182 for 52 Xs.
Variable to drop: day_6

Adjusted R2 = 0.6855790149250771 for 51 Xs.
Variable to drop: day_4

Adjusted R2 = 0.685538201827948 for 50 Xs.
Variable to drop: day_11

Adjusted R2 = 0.6854918228856638 for 49 Xs.
Variable to drop: month_5

Adjusted R2 = 0.6854453570997006 for 48 Xs.
Variable to drop: day_16

Adjusted R2 = 0.6853966998632316 for 47 Xs.
Variable to drop: month_3

Adjusted R2 = 0.6853331357539708 for 46 Xs.
Variable to drop: month_10

Adjusted R2 = 0.6853020368130502 for 45 Xs.
Variable to drop: day_15

Adjusted R2 = 0.6852266082364908 for 44 Xs.
Variable to drop: day_20

Adjusted R2 = 0.6851420085398512 for 43 Xs.
Variable to drop: hour_2

Adjusted R2 = 0.6850367692745023 for 42 Xs.
Variable to drop: hour_5

Adjusted R2 = 0.6849397007534863 for 41 Xs.
Variable to drop: month_2

Adjusted R2 = 0.6848283384743894 for 40 Xs.
Variable to drop: month_1

Adjusted R2 = 0.6847468621381798 for 39 Xs.
Variable to drop: month_4

Adjusted R2 = 0.6845651330323977 for 38 Xs.
Variable to drop: hour_1

Adjusted R2 = 0.6843837963931841 for 37 Xs.
Variable to drop: day_17

Adjusted R2 = 0.6841908193957087 for 36 Xs.
Variable to drop: is_holiday_1

Adjusted R2 = 0.683983662586047 for 35 Xs.
Variable to drop: month_6

Adjusted R2 = 0.6835433606457065 for 34 Xs.
Variable to drop: month_8

Adjusted R2 = 0.6831628778496036 for 33 Xs.
Variable to drop: wind_speed

Adjusted R2 = 0.6826746797820062 for 32 Xs.
Variable to drop: month_12

Adjusted R2 = 0.68206875483279 for 31 Xs.
Variable to drop: month_11

Adjusted R2 = 0.6814917799943856 for 30 Xs.
Variable to drop: day_of_week_6

Adjusted R2 = 0.6806869722358586 for 29 Xs.
Variable to drop: is_working_day_1

Adjusted R2 = 0.6804122984854217 for 28 Xs.
Variable to drop: hour_0

Adjusted R2 = 0.6795735846967551 for 27 Xs.
Variable to drop: season_3

Adjusted R2 = 0.6780386051055483 for 26 Xs.
Variable to drop: month_7

Adjusted R2 = 0.6769268731093461 for 25 Xs.
Variable to drop: humidity

Adjusted R2 = 0.6742791327252889 for 24 Xs.
Variable to drop: season_2

Adjusted R2 = 0.6707621672617614 for 23 Xs.
Variable to drop: hour_23

Adjusted R2 = 0.6671148253072732 for 22 Xs.
Variable to drop: hour_6

Adjusted R2 = 0.6643646429987768 for 21 Xs.
Variable to drop: weather_type_2

Adjusted R2 = 0.6567296820924805 for 20 Xs.
Variable to drop: weather_type_1

Adjusted R2 = 0.6492217316159239 for 19 Xs.
Variable to drop: hour_22

Adjusted R2 = 0.6414433345452661 for 18 Xs.
Variable to drop: season_4

Adjusted R2 = 0.6314280683391205 for 17 Xs.
Variable to drop: hour_10

Adjusted R2 = 0.6171153698006244 for 16 Xs.
Variable to drop: hour_21

Adjusted R2 = 0.605441619734016 for 15 Xs.
Variable to drop: hour_11

Adjusted R2 = 0.5889616120417926 for 14 Xs.
Variable to drop: hour_14

Adjusted R2 = 0.5696052222980719 for 13 Xs.
Variable to drop: hour_20

Adjusted R2 = 0.5525390681654606 for 12 Xs.
Variable to drop: hour_9

Adjusted R2 = 0.5369063856316603 for 11 Xs.
Variable to drop: hour_15

Adjusted R2 = 0.5225455906813559 for 10 Xs.
Variable to drop: hour_7

Adjusted R2 = 0.5093421345905575 for 9 Xs.
Variable to drop: hour_13

Adjusted R2 = 0.496565435629136 for 8 Xs.
Variable to drop: hour_12

Adjusted R2 = 0.48420325440135514 for 7 Xs.
Variable to drop: hour_16

Adjusted R2 = 0.4584580616887255 for 6 Xs.
Variable to drop: hour_19

Adjusted R2 = 0.4306768645856852 for 5 Xs.
Variable to drop: year_0

Adjusted R2 = 0.37514432903076966 for 4 Xs.
Variable to drop: hour_8

Adjusted R2 = 0.3185755715955978 for 3 Xs.
Variable to drop: hour_18

Adjusted R2 = 0.24739869560194716 for 2 Xs.
Variable to drop: hour_17

Adjusted R2 = 0.16379247649559414 for 1 Xs.
Variable left: temp

Restarting from best model (with 75 Xs) found so far...

X pairs with correlations > 0.995 :
is_working_day_0 , is_working_day_1
is_holiday_0 , is_holiday_1
year_1 , year_0

3 variables considered for deletion:
is_working_day_0
is_holiday_0
year_1

X pairs with correlations > 0.995 :
(no more)

0 interaction variables deleted.

Adjusted R2 = 0.685943187392246 for 75 Xs.
Variable to drop: day_18

Adjusted R2 = 0.685933552182301 for 74 Xs.
Variable to drop: is_working_day_0

Adjusted R2 = 0.6859243754795237 for 73 Xs.
Variable to drop: year_1

Adjusted R2 = 0.685926157055642 for 72 Xs.
Variable to drop: day_of_week_0

Adjusted R2 = 0.6859257531509151 for 71 Xs.
Variable to drop: day_of_week_5

Adjusted R2 = 0.6859172330264544 for 70 Xs.
Variable to drop: is_holiday_0

Adjusted R2 = 0.6859394778348251 for 69 Xs.
Variable to drop: day_2

Adjusted R2 = 0.6859363613974276 for 68 Xs.
Variable to drop: day_30

Adjusted R2 = 0.6859333581192022 for 67 Xs.
Variable to drop: day_of_week_3

Adjusted R2 = 0.6859259451407438 for 66 Xs.
Variable to drop: day_of_week_4

Adjusted R2 = 0.6859290993719822 for 65 Xs.
Variable to drop: day_23

Adjusted R2 = 0.6859115206057089 for 64 Xs.
Variable to drop: day_5

Adjusted R2 = 0.6858917016908037 for 63 Xs.
Variable to drop: day_19

Adjusted R2 = 0.6858718667860461 for 62 Xs.
Variable to drop: day_31

Adjusted R2 = 0.685850097012868 for 61 Xs.
Variable to drop: day_14

Adjusted R2 = 0.6858228107518651 for 60 Xs.
Variable to drop: day_12

Adjusted R2 = 0.6857972892478064 for 59 Xs.
Variable to drop: day_3

Adjusted R2 = 0.685773165085928 for 58 Xs.
Variable to drop: day_10

Adjusted R2 = 0.6857535190994006 for 57 Xs.
Variable to drop: day_13

Adjusted R2 = 0.6857355903086282 for 56 Xs.
Variable to drop: day_9

Adjusted R2 = 0.6857147573783373 for 55 Xs.
Variable to drop: day_of_week_2

Adjusted R2 = 0.6856858128101659 for 54 Xs.
Variable to drop: day_of_week_1

Adjusted R2 = 0.6856613197619186 for 53 Xs.
Variable to drop: day_21

Adjusted R2 = 0.6856177942973182 for 52 Xs.
Variable to drop: day_6

Adjusted R2 = 0.6855790149250771 for 51 Xs.
Variable to drop: day_4

Adjusted R2 = 0.685538201827948 for 50 Xs.
Variable to drop: day_11

Adjusted R2 = 0.6854918228856638 for 49 Xs.
Variable to drop: month_5

Adjusted R2 = 0.6854453570997006 for 48 Xs.
Variable to drop: day_16

Adjusted R2 = 0.6853966998632316 for 47 Xs.
Variable to drop: month_3

Adjusted R2 = 0.6853331357539708 for 46 Xs.
Variable to drop: month_10

Adjusted R2 = 0.6853020368130502 for 45 Xs.
Variable to drop: day_15

Adjusted R2 = 0.6852266082364908 for 44 Xs.
Variable to drop: day_20

Adjusted R2 = 0.6851420085398512 for 43 Xs.
Variable to drop: hour_2

Adjusted R2 = 0.6850367692745023 for 42 Xs.
Variable to drop: hour_5

Adjusted R2 = 0.6849397007534863 for 41 Xs.
Variable to drop: month_2

Adjusted R2 = 0.6848283384743894 for 40 Xs.
Variable to drop: month_1

Adjusted R2 = 0.6847468621381798 for 39 Xs.
Variable to drop: month_4

Adjusted R2 = 0.6845651330323977 for 38 Xs.
Variable to drop: hour_1

Adjusted R2 = 0.6843837963931841 for 37 Xs.
Variable to drop: day_17

Adjusted R2 = 0.6841908193957087 for 36 Xs.
Variable to drop: is_holiday_1

Adjusted R2 = 0.683983662586047 for 35 Xs.
Variable to drop: month_6

Adjusted R2 = 0.6835433606457065 for 34 Xs.
Variable to drop: month_8

Adjusted R2 = 0.6831628778496036 for 33 Xs.
Variable to drop: wind_speed

Adjusted R2 = 0.6826746797820062 for 32 Xs.
Variable to drop: month_12

Adjusted R2 = 0.68206875483279 for 31 Xs.
Variable to drop: month_11

Adjusted R2 = 0.6814917799943856 for 30 Xs.
Variable to drop: day_of_week_6

Adjusted R2 = 0.6806869722358586 for 29 Xs.
Variable to drop: is_working_day_1

Adjusted R2 = 0.6804122984854217 for 28 Xs.
Variable to drop: hour_0

Adjusted R2 = 0.6795735846967551 for 27 Xs.
Variable to drop: season_3

Adjusted R2 = 0.6780386051055483 for 26 Xs.
Variable to drop: month_7

Adjusted R2 = 0.6769268731093461 for 25 Xs.
Variable to drop: humidity

Adjusted R2 = 0.6742791327252889 for 24 Xs.
Variable to drop: season_2

Adjusted R2 = 0.6707621672617614 for 23 Xs.
Variable to drop: hour_23

Adjusted R2 = 0.6671148253072732 for 22 Xs.
Variable to drop: hour_6

Adjusted R2 = 0.6643646429987768 for 21 Xs.
Variable to drop: weather_type_2

Adjusted R2 = 0.6567296820924805 for 20 Xs.
Variable to drop: weather_type_1

Adjusted R2 = 0.6492217316159239 for 19 Xs.
Variable to drop: hour_22

Adjusted R2 = 0.6414433345452661 for 18 Xs.
Variable to drop: season_4

Adjusted R2 = 0.6314280683391205 for 17 Xs.
Variable to drop: hour_10

Adjusted R2 = 0.6171153698006244 for 16 Xs.
Variable to drop: hour_21

Adjusted R2 = 0.605441619734016 for 15 Xs.
Variable to drop: hour_11

Adjusted R2 = 0.5889616120417926 for 14 Xs.
Variable to drop: hour_14

Adjusted R2 = 0.5696052222980719 for 13 Xs.
Variable to drop: hour_20

Adjusted R2 = 0.5525390681654606 for 12 Xs.
Variable to drop: hour_9

Adjusted R2 = 0.5369063856316603 for 11 Xs.
Variable to drop: hour_15

Adjusted R2 = 0.5225455906813559 for 10 Xs.
Variable to drop: hour_7

Adjusted R2 = 0.5093421345905575 for 9 Xs.
Variable to drop: hour_13

Adjusted R2 = 0.496565435629136 for 8 Xs.
Variable to drop: hour_12

Adjusted R2 = 0.48420325440135514 for 7 Xs.
Variable to drop: hour_16

Adjusted R2 = 0.4584580616887255 for 6 Xs.
Variable to drop: hour_19

Adjusted R2 = 0.4306768645856852 for 5 Xs.
Variable to drop: year_0

Adjusted R2 = 0.37514432903076966 for 4 Xs.
Variable to drop: hour_8

Adjusted R2 = 0.3185755715955978 for 3 Xs.
Variable to drop: hour_18

Adjusted R2 = 0.24739869560194716 for 2 Xs.
Variable to drop: hour_17

Adjusted R2 = 0.16379247649559414 for 1 Xs.
Variable left: temp

Best model has 75 Xs:
                  Results: Ordinary least squares
====================================================================
Model:              OLS              Adj. R-squared:     0.686      
Dependent Variable: total_users      AIC:                210025.9981
Date:               2020-04-30 19:38 BIC:                210577.1723
No. Observations:   17379            Log-Likelihood:     -1.0494e+05
Df Model:           70               F-statistic:        543.2      
Df Residuals:       17308            Prob (F-statistic): 0.00       
R-squared:          0.687            Scale:              10333.     
--------------------------------------------------------------------
                  Coef.   Std.Err.    t     P>|t|   [0.025   0.975] 
--------------------------------------------------------------------
Intercept        -45.1976   3.8629 -11.7004 0.0000 -52.7693 -37.6259
temp             235.9979   9.4394  25.0015 0.0000 217.4958 254.5000
humidity         -79.8697   5.5831 -14.3056 0.0000 -90.8131 -68.9262
wind_speed       -36.4506   6.8609  -5.3128 0.0000 -49.8988 -23.0025
season_2          38.2830   4.8611   7.8754 0.0000  28.7548  47.8111
season_3          31.9511   5.7540   5.5529 0.0000  20.6727  43.2295
season_4          68.1640   4.8905  13.9380 0.0000  58.5781  77.7499
year_0           -65.2920   2.0885 -31.2634 0.0000 -69.3856 -61.1985
year_1            20.0944   2.0782   9.6692 0.0000  16.0210  24.1679
month_1          -29.8796   7.0011  -4.2679 0.0000 -43.6024 -16.1568
month_2          -26.1243   6.8112  -3.8355 0.0001 -39.4750 -12.7737
month_3          -15.6576   5.9646  -2.6251 0.0087 -27.3488  -3.9664
month_4          -23.4372   6.1027  -3.8404 0.0001 -35.3992 -11.4752
month_5          -10.8380   5.9763  -1.8135 0.0698 -22.5521   0.8761
month_6          -26.2350   5.0521  -5.1929 0.0000 -36.1377 -16.3323
month_7          -45.9884   4.2378 -10.8518 0.0000 -54.2950 -37.6818
month_8          -26.1276   4.1193  -6.3427 0.0000 -34.2019 -18.0533
month_10         -14.7933   5.4778  -2.7006 0.0069 -25.5303  -4.0563
month_11         -39.4260   5.8254  -6.7679 0.0000 -50.8443 -28.0076
month_12         -35.9879   5.7555  -6.2528 0.0000 -47.2693 -24.7066
day_2              5.2592   4.4636   1.1783 0.2387  -3.4898  14.0082
day_3             10.3484   4.4710   2.3146 0.0206   1.5848  19.1119
day_4             13.0489   4.4708   2.9187 0.0035   4.2856  21.8123
day_5              8.2446   4.4565   1.8500 0.0643  -0.4906  16.9799
day_6             12.8414   4.4740   2.8702 0.0041   4.0719  21.6109
day_9             10.9010   4.4607   2.4438 0.0145   2.1575  19.6445
day_10            10.3542   4.4766   2.3129 0.0207   1.5795  19.1288
day_11            13.9762   4.4888   3.1136 0.0019   5.1778  22.7746
day_12            10.2309   4.4681   2.2898 0.0220   1.4730  18.9888
day_13            10.4281   4.4692   2.3333 0.0196   1.6680  19.1882
day_14             9.7428   4.4634   2.1828 0.0291   0.9940  18.4915
day_15            16.0814   4.4615   3.6045 0.0003   7.3365  24.8263
day_16            14.1658   4.4671   3.1711 0.0015   5.4098  22.9217
day_17            22.1821   4.4549   4.9792 0.0000  13.4500  30.9142
day_18             4.9387   4.5001   1.0975 0.2724  -3.8819  13.7594
day_19             8.6750   4.4634   1.9436 0.0520  -0.0736  17.4237
day_20            16.9091   4.4620   3.7895 0.0002   8.1630  25.6552
day_21            12.6008   4.4833   2.8106 0.0050   3.8131  21.3886
day_23             7.3737   4.4672   1.6506 0.0988  -1.3824  16.1298
day_30             5.9226   4.7147   1.2562 0.2091  -3.3188  15.1639
day_31            11.1928   5.7863   1.9344 0.0531  -0.1489  22.5344
hour_0            38.7379   4.6641   8.3056 0.0000  29.5959  47.8800
hour_1            21.3418   4.6623   4.5775 0.0000  12.2031  30.4804
hour_2            12.2399   4.6781   2.6164 0.0089   3.0704  21.4095
hour_5            15.1663   4.6731   3.2455 0.0012   6.0066  24.3261
hour_6            74.0873   4.6565  15.9104 0.0000  64.9600  83.2146
hour_7           209.2163   4.6521  44.9729 0.0000 200.0978 218.3348
hour_8           349.7594   4.6588  75.0755 0.0000 340.6278 358.8911
hour_9           202.0554   4.6861  43.1176 0.0000 192.8701 211.2407
hour_10          147.2337   4.7342  31.1003 0.0000 137.9542 156.5131
hour_11          172.6397   4.8007  35.9612 0.0000 163.2298 182.0496
hour_12          211.9689   4.8705  43.5210 0.0000 202.4223 221.5156
hour_13          206.9137   4.9254  42.0098 0.0000 197.2595 216.5679
hour_14          191.0489   4.9681  38.4553 0.0000 181.3109 200.7868
hour_15          200.4175   4.9828  40.2221 0.0000 190.6508 210.1842
hour_16          262.4513   4.9658  52.8513 0.0000 252.7177 272.1848
hour_17          416.1298   4.9220  84.5453 0.0000 406.4823 425.7774
hour_18          384.2972   4.8687  78.9316 0.0000 374.7540 393.8404
hour_19          275.7436   4.7943  57.5148 0.0000 266.3463 285.1409
hour_20          196.2114   4.7463  41.3396 0.0000 186.9082 205.5147
hour_21          146.7913   4.7071  31.1851 0.0000 137.5649 156.0177
hour_22          109.7684   4.6851  23.4290 0.0000 100.5851 118.9518
hour_23           70.9060   4.6716  15.1781 0.0000  61.7492  80.0629
is_holiday_0     -15.2950   2.5970  -5.8895 0.0000 -20.3854 -10.2047
is_holiday_1     -29.9026   2.8519 -10.4851 0.0000 -35.4926 -24.3126
day_of_week_0     -7.9209   1.6619  -4.7662 0.0000 -11.1784  -4.6635
day_of_week_1    -12.5137   1.9812  -6.3162 0.0000 -16.3970  -8.6304
day_of_week_2    -11.0762   2.0677  -5.3567 0.0000 -15.1291  -7.0232
day_of_week_3     -8.3542   2.0534  -4.0685 0.0000 -12.3790  -4.3293
day_of_week_4     -8.7112   2.0321  -4.2869 0.0000 -12.6942  -4.7281
day_of_week_5     -4.9343   2.0005  -2.4665 0.0137  -8.8555  -1.0131
day_of_week_6      8.3128   1.6562   5.0193 0.0000   5.0665  11.5590
is_working_day_0 -29.5107   2.3576 -12.5175 0.0000 -34.1318 -24.8897
is_working_day_1 -15.6869   1.7953  -8.7379 0.0000 -19.2058 -12.1680
weather_type_1    65.5922   3.2330  20.2884 0.0000  59.2552  71.9291
weather_type_2    54.6646   3.2299  16.9244 0.0000  48.3336  60.9955
--------------------------------------------------------------------
Omnibus:          1109.936    Durbin-Watson:       0.507            
Prob(Omnibus):    0.000       Jarque-Bera (JB):    2227.712         
Skew:             0.448       Prob(JB):            0.000            
Kurtosis:         4.508       Condition No.:       19756029393318504
====================================================================
* The condition number is large (2e+16). This might indicate
strong multicollinearity or other numerical problems.
In [11]:
# Split training and testing set for ols model after variable selection
data = df2.copy()

x1 = data.iloc[:,1:]
y1 = data.iloc[:,0:1]

x1_train, x1_test, y1_train, y1_test = train_test_split(x1, y1, 
                                                        test_size=0.3,
                                                        shuffle=True,
                                                        random_state=42)

training_set = pd.concat([y1_train, x1_train], axis=1)
training_set
Out[11]:
total_users temp humidity wind_speed season_2 season_3 season_4 year_0 year_1 month_1 ... day_of_week_1 day_of_week_2 day_of_week_3 day_of_week_4 day_of_week_5 day_of_week_6 is_working_day_0 is_working_day_1 weather_type_1 weather_type_2
11229 9.0 0.50 0.42 0.1940 1 0 0 0 1 0 ... 0 0 1 0 0 0 0 1 0 1
6334 209.0 0.68 0.79 0.1642 0 0 1 1 0 0 ... 1 0 0 0 0 0 0 1 0 1
11060 30.0 0.34 0.36 0.4179 1 0 0 0 1 0 ... 0 0 1 0 0 0 0 1 1 0
12346 644.0 0.70 0.34 0.0000 1 0 0 0 1 0 ... 0 0 0 0 0 0 1 0 1 0
15437 217.0 0.56 0.52 0.2985 0 0 1 0 1 0 ... 0 0 1 0 0 0 0 1 1 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
11284 359.0 0.46 0.88 0.0896 1 0 0 0 1 0 ... 0 0 0 0 1 0 0 1 1 0
11964 812.0 0.66 0.34 0.1343 1 0 0 0 1 0 ... 0 0 0 0 1 0 0 1 1 0
5390 189.0 0.80 0.43 0.2836 0 1 0 1 0 0 ... 0 0 1 0 0 0 0 1 1 0
860 100.0 0.24 0.65 0.4179 0 0 0 1 0 0 ... 0 1 0 0 0 0 0 1 1 0
15795 779.0 0.52 0.83 0.1642 0 0 1 0 1 0 ... 0 0 0 1 0 0 0 1 0 1

12165 rows × 76 columns

In [12]:
rmse_comparison = []

reg_eqn = ' + '.join(list(df2.columns)).replace('+', '~', 1)
print('Model equation:\n', reg_eqn)

reg_model = ols(reg_eqn, training_set).fit()
y_pred = reg_model.predict(x1_test)
rmse = np.sqrt(mean_squared_error(y_pred, y1_test))
rmse_comparison.append(rmse)

print('\nRMSE (Linear Regression) =', rmse)
Model equation:
 total_users ~ temp + humidity + wind_speed + season_2 + season_3 + season_4 + year_0 + year_1 + month_1 + month_2 + month_3 + month_4 + month_5 + month_6 + month_7 + month_8 + month_10 + month_11 + month_12 + day_2 + day_3 + day_4 + day_5 + day_6 + day_9 + day_10 + day_11 + day_12 + day_13 + day_14 + day_15 + day_16 + day_17 + day_18 + day_19 + day_20 + day_21 + day_23 + day_30 + day_31 + hour_0 + hour_1 + hour_2 + hour_5 + hour_6 + hour_7 + hour_8 + hour_9 + hour_10 + hour_11 + hour_12 + hour_13 + hour_14 + hour_15 + hour_16 + hour_17 + hour_18 + hour_19 + hour_20 + hour_21 + hour_22 + hour_23 + is_holiday_0 + is_holiday_1 + day_of_week_0 + day_of_week_1 + day_of_week_2 + day_of_week_3 + day_of_week_4 + day_of_week_5 + day_of_week_6 + is_working_day_0 + is_working_day_1 + weather_type_1 + weather_type_2

RMSE (Linear Regression) = 100.55924534210169

Linear Regression

In [13]:
# Split training and testing set for the remaining regression models
data = df_regression.copy()

x = data.iloc[:,1:]
y = data.iloc[:,0:1]

x_train, x_test, y_train, y_test = train_test_split(x, y, 
                                                    test_size = 0.3, 
                                                    shuffle=True, 
                                                    random_state=42)
In [14]:
linear_regr = LinearRegression(fit_intercept=True,
                               normalize=False, 
                               copy_X=True, 
                               n_jobs=-1)

linear_regr.fit(x_train, y_train)
y_pred = linear_regr.predict(x_test)
rmse = np.sqrt(mean_squared_error(y_pred, y_test))
rmse_comparison.append(rmse)

print('RMSE (Linear Regression) =',rmse)
RMSE (Linear Regression) = 100.61777150034399

Lasso Regression

In [15]:
alpha = np.linspace(0,10,1001)
rmse_all = []

for i in alpha:
    lasso_regr = Lasso(alpha=i, 
                       fit_intercept=True,
                       normalize=False,
                       copy_X=True,
                       max_iter=100,
                       tol=0.01,
                       random_state=42)
    lasso_regr.fit(x_train, y_train)
    y_pred = lasso_regr.predict(x_test)
    rmse = np.sqrt(mean_squared_error(y_pred, y_test))
    rmse_all.append(rmse)

index = np.argmin(rmse_all)
best_alpha = alpha[index]
best_rmse = rmse_all[index]
rmse_comparison.append(best_rmse)

plt.plot(alpha, rmse_all)
plt.xlabel("alpha")
plt.ylabel("rmse")
plt.show()

print('Best alpha =', best_alpha)
print('Best RMSE (Lasso Regression) =', best_rmse)
Best alpha = 0.07
Best RMSE (Lasso Regression) = 100.47701590657323

Ridge Regression

In [16]:
alpha = np.linspace(0,10,1001)
rmse_all = []

for i in alpha:
    ridge_regr = Ridge(alpha=i,
                       fit_intercept=True,
                       normalize=False,
                       copy_X=True,
                       max_iter=100,
                       tol=0.01,
                       random_state=42)
    ridge_regr.fit(x_train, y_train)
    y_pred = ridge_regr.predict(x_test)
    rmse = np.sqrt(mean_squared_error(y_pred, y_test))
    rmse_all.append(rmse)

index = np.argmin(rmse_all)
best_alpha = alpha[index]
best_rmse = rmse_all[index]
rmse_comparison.append(best_rmse)

plt.plot(alpha, rmse_all)
plt.xlabel("alpha")
plt.ylabel("rmse")
plt.show()

print('Best alpha =', best_alpha)
print('Best RMSE (Ridge Regression) =', best_rmse)
Best alpha = 3.49
Best RMSE (Ridge Regression) = 100.58650824422048

ElasticNet Regression

In [17]:
enet_tm = time.time()

alpha = np.linspace(0,5,501)
ratio = np.linspace(0,1,11)

best_alpha = None
best_ratio = None
best_rmse = 1e+8

for i in alpha:
    for j in ratio:
        elastic_net = ElasticNet(alpha=i,
                                 l1_ratio=j,
                                 fit_intercept=True,
                                 normalize=False,
                                 max_iter=100,
                                 tol=0.01,
                                 copy_X=True,
                                 random_state=42)
        elastic_net.fit(x_train, y_train)
        y_pred = elastic_net.predict(x_test)
        rmse = np.sqrt(mean_squared_error(y_pred, y_test))

        if rmse < best_rmse:
            best_alpha = i
            best_ratio = j
            best_rmse = rmse

rmse_comparison.append(best_rmse)

print('Best alpha =', best_alpha)
print('Best ratio =', best_ratio)
print('Best RMSE (Elastic Net) =', best_rmse)
print('Runtime:', round((time.time()-enet_tm)/60,2), 'mins')
Best alpha = 0.07
Best ratio = 1.0
Best RMSE (Elastic Net) = 100.47701590657323
Runtime: 22.86 mins

Random Forest Regression

In [18]:
# Grid Search CV for best RFR model hyperparameters
rfr_tm = time.time()

rfr = RandomForestRegressor(random_state=42)

parametersGrid = { 
    'n_estimators': [100, 200, 500],
    'max_features': ['auto','sqrt','log2']
    }

best_rfr = GridSearchCV(estimator=rfr, param_grid=parametersGrid, cv=5, n_jobs=-1)
best_rfr.fit(x_train, y_train)

print('Runtime:', round((time.time()-rfr_tm)/60,2), 'mins')
best_rfr.best_params_
Runtime: 13.12 mins
Out[18]:
{'max_features': 'auto', 'n_estimators': 500}
In [19]:
y_pred = best_rfr.predict(x_test)

rmse = np.sqrt(mean_squared_error(y_pred, y_test))
rmse_comparison.append(rmse)

print('Best RMSE (Random Forest Regression) =', rmse)
Best RMSE (Random Forest Regression) = 47.42456301408691
In [20]:
# Comparing different linear model performances
df_rmse = pd.DataFrame({'RMSE':rmse_comparison}, index=['OLS (Variable Selection)',
                                                        'Linear Regression',
                                                        'Lasso Regression',
                                                        'Ridge Regression',
                                                        'ElasticNet Regression',
                                                        'Random Forest Regression'])
df_rmse
Out[20]:
RMSE
OLS (Variable Selection) 100.559245
Linear Regression 100.617772
Lasso Regression 100.477016
Ridge Regression 100.586508
ElasticNet Regression 100.477016
Random Forest Regression 47.424563

Classification Analysis

Classification Data Preparation

In [21]:
# Selecting features that are relevant to total bike usage
df_classifier = df_master.loc[:,'hour':].copy()
df_classifier.dropna(axis=0, how='any', inplace=True)
df_classifier.reset_index(drop=True, inplace=True)

# Create total bike usage and 
df_classifier['total_users'] = df_classifier['casual'] + df_classifier['registered']
df_classifier.drop(columns=['app_temp',
                            'casual',
                            'registered'], inplace=True)

df_classifier['weather_type'] = df_classifier['weather_type'].astype('int64')

df_classifier.describe()
Out[21]:
hour is_holiday day_of_week is_working_day weather_type temp humidity wind_speed total_users
count 17379.000000 17379.000000 17379.000000 17379.000000 17379.000000 17379.000000 17379.000000 17379.000000 17379.000000
mean 11.546752 0.028770 3.003683 0.682721 1.425283 0.496987 0.627229 0.190098 189.463088
std 6.914405 0.167165 2.005771 0.465431 0.639357 0.192556 0.192930 0.122340 181.387599
min 0.000000 0.000000 0.000000 0.000000 1.000000 0.020000 0.000000 0.000000 1.000000
25% 6.000000 0.000000 1.000000 0.000000 1.000000 0.340000 0.480000 0.104500 40.000000
50% 12.000000 0.000000 3.000000 1.000000 1.000000 0.500000 0.630000 0.194000 142.000000
75% 18.000000 0.000000 5.000000 1.000000 2.000000 0.660000 0.780000 0.253700 281.000000
max 23.000000 1.000000 6.000000 1.000000 4.000000 1.000000 1.000000 0.850700 977.000000
In [22]:
# Visualizing Total User Count for feature engineering
plt.figure(figsize=(10,6))
plt.xlabel('total_users')
plt.title('Total Bike User Count')
df_classifier.total_users.plot.hist(bins=[0,200,600,1000])
Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x1c30cfab148>
In [23]:
# Feature Engineering (Creating 'Low', 'Moderate' and 'High' User Traffic from User Count)
df_classifier['user_traffic'] = ''

for i in range(len(df_classifier)):
    temp = df_classifier['total_users'][i]

    if temp < 200:
        df_classifier['user_traffic'][i] = 'Low'
    elif temp < 600:
        df_classifier['user_traffic'][i] = 'Moderate'
    else:
        df_classifier['user_traffic'][i] = 'High'

df_classifier.drop(columns=['total_users'], inplace=True)
df_classifier = pd.concat([df_classifier.pop('user_traffic'),df_classifier], axis=1)

print(df_classifier['user_traffic'].value_counts())
df_classifier
Low         10763
Moderate     5949
High          667
Name: user_traffic, dtype: int64
Out[23]:
user_traffic hour is_holiday day_of_week is_working_day weather_type temp humidity wind_speed
0 Low 0 0 6 0 1 0.24 0.81 0.0000
1 Low 1 0 6 0 1 0.22 0.80 0.0000
2 Low 2 0 6 0 1 0.22 0.80 0.0000
3 Low 3 0 6 0 1 0.24 0.75 0.0000
4 Low 4 0 6 0 1 0.24 0.75 0.0000
... ... ... ... ... ... ... ... ... ...
17374 Low 19 0 1 1 2 0.26 0.60 0.1642
17375 Low 20 0 1 1 2 0.26 0.60 0.1642
17376 Low 21 0 1 1 1 0.26 0.60 0.1642
17377 Low 22 0 1 1 1 0.26 0.56 0.1343
17378 Low 23 0 1 1 1 0.26 0.65 0.1343

17379 rows × 9 columns

In [24]:
# Creating categorical dummy variables
cat_vars = list(df_classifier.columns)[1:-3]
num_vars = list(df_classifier.columns)[-3:]

quantitative_data = df_classifier[num_vars].copy()
categorical_data = df_classifier[cat_vars].copy()
categorical_data.astype('category')

dummy = pd.get_dummies(categorical_data, columns=list(categorical_data.columns))
df_classifier = pd.concat([df_classifier[['user_traffic']],quantitative_data,dummy],axis=1)
df_classifier
Out[24]:
user_traffic temp humidity wind_speed hour_0 hour_1 hour_2 hour_3 hour_4 hour_5 ... day_of_week_3 day_of_week_4 day_of_week_5 day_of_week_6 is_working_day_0 is_working_day_1 weather_type_1 weather_type_2 weather_type_3 weather_type_4
0 Low 0.24 0.81 0.0000 1 0 0 0 0 0 ... 0 0 0 1 1 0 1 0 0 0
1 Low 0.22 0.80 0.0000 0 1 0 0 0 0 ... 0 0 0 1 1 0 1 0 0 0
2 Low 0.22 0.80 0.0000 0 0 1 0 0 0 ... 0 0 0 1 1 0 1 0 0 0
3 Low 0.24 0.75 0.0000 0 0 0 1 0 0 ... 0 0 0 1 1 0 1 0 0 0
4 Low 0.24 0.75 0.0000 0 0 0 0 1 0 ... 0 0 0 1 1 0 1 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
17374 Low 0.26 0.60 0.1642 0 0 0 0 0 0 ... 0 0 0 0 0 1 0 1 0 0
17375 Low 0.26 0.60 0.1642 0 0 0 0 0 0 ... 0 0 0 0 0 1 0 1 0 0
17376 Low 0.26 0.60 0.1642 0 0 0 0 0 0 ... 0 0 0 0 0 1 1 0 0 0
17377 Low 0.26 0.56 0.1343 0 0 0 0 0 0 ... 0 0 0 0 0 1 1 0 0 0
17378 Low 0.26 0.65 0.1343 0 0 0 0 0 0 ... 0 0 0 0 0 1 1 0 0 0

17379 rows × 43 columns

In [25]:
# Split train and test set for classification cross validation
x = df_classifier.iloc[:,1:].copy()
y = df_classifier.iloc[:,0:1].copy()

x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                    test_size=0.3,
                                                    shuffle=True,
                                                    random_state=42)

K-NN Classification

In [26]:
# K-NN Classification
classifier_comparison = []

knn_classifier = KNeighborsClassifier(n_neighbors=10,
                                      weights='distance',
                                      algorithm='auto',
                                      p=2,
                                      n_jobs=-1)

knn_classifier.fit(x_train, y_train)

print('Accuracy of fit =', knn_classifier.score(x_train, y_train))
print('Accuracy of prediction =', knn_classifier.score(x_test, y_test), '\n')
classifier_comparison.append(knn_classifier.score(x_test, y_test))

tab = pd.crosstab(knn_classifier.predict(x_test), y_test['user_traffic'], margins=True)
tab.index.name = 'Prediction'
tab
Accuracy of fit = 0.9991779695848746
Accuracy of prediction = 0.825469888761028 

Out[26]:
user_traffic High Low Moderate All
Prediction
High 63 0 79 142
Low 1 2862 282 3145
Moderate 116 432 1379 1927
All 180 3294 1740 5214

Decision Tree Classification

In [27]:
# Decision Tree Classification
decision_tree_classifier = DecisionTreeClassifier(criterion='gini',
                                                  splitter='best',
                                                  random_state=42)

decision_tree_classifier.fit(x_train, y_train)

print('Accuracy of Fit (Decision Tree) =', decision_tree_classifier.score(x_train, y_train))
print('Accuracy of Prediction (Decision Tree) =', decision_tree_classifier.score(x_test, y_test), '\n')
classifier_comparison.append(decision_tree_classifier.score(x_test, y_test))

tab2 = pd.crosstab(decision_tree_classifier.predict(x_test), y_test['user_traffic'], margins=True)
tab2.index.name = 'Prediction'
tab2
Accuracy of Fit (Decision Tree) = 0.9991779695848746
Accuracy of Prediction (Decision Tree) = 0.8091676256233218 

Out[27]:
user_traffic High Low Moderate All
Prediction
High 81 4 120 205
Low 1 2881 363 3245
Moderate 98 409 1257 1764
All 180 3294 1740 5214

Decision Tree Classification (with Pruning)

In [28]:
# Tuning Max Depth hyperparameter to reduce model complexity and determining 'gini' vs 'entropy' criterion
# for best model performance
gini_accuracy = []
entropy_accuracy = []
depth_range = np.arange(1,31)

for i in depth_range:
    dt_classifier = DecisionTreeClassifier(criterion='gini',
                                           splitter='best',
                                           max_depth=i,
                                           random_state=42)
    dt_classifier.fit(x_train, y_train)
    g_acc = dt_classifier.score(x_test, y_test)
    gini_accuracy.append(g_acc)

    dt_classifier = DecisionTreeClassifier(criterion='entropy',
                                           splitter='best',
                                           max_depth=i,
                                           random_state=42)
    dt_classifier.fit(x_train, y_train)
    e_acc = dt_classifier.score(x_test, y_test)
    entropy_accuracy.append(e_acc)

plt.figure(figsize=(12,8))
plt.plot(depth_range, gini_accuracy, label='gini')
plt.plot(depth_range, entropy_accuracy, label='entropy')
plt.title('Max Depth vs Accuracy')
plt.xlabel('Max Depth')
plt.ylabel('Accuracy')
plt.legend()
plt.tight_layout()
plt.show()
In [29]:
# Comparing Accuracy Scores across both models for different Max Depth levels
df_pruning = pd.DataFrame({'Max Depth':depth_range,
                           'Gini Accuracy':gini_accuracy,
                           'Entropy Accuracy':entropy_accuracy})
df_pruning
Out[29]:
Max Depth Gini Accuracy Entropy Accuracy
0 1 0.631761 0.631761
1 2 0.706751 0.713656
2 3 0.718259 0.715957
3 4 0.739356 0.725547
4 5 0.756425 0.733602
5 6 0.771193 0.744534
6 7 0.785194 0.752397
7 8 0.794208 0.755082
8 9 0.796701 0.762179
9 10 0.795550 0.768316
10 11 0.800345 0.774837
11 12 0.809359 0.784235
12 13 0.807633 0.791523
13 14 0.811086 0.795359
14 15 0.815689 0.797660
15 16 0.817031 0.798236
16 17 0.817990 0.802263
17 18 0.821634 0.810127
18 19 0.818757 0.808976
19 20 0.819908 0.810702
20 21 0.813195 0.808017
21 22 0.814730 0.809168
22 23 0.811277 0.808976
23 24 0.812812 0.811661
24 25 0.809935 0.808592
25 26 0.813195 0.810127
26 27 0.811661 0.810510
27 28 0.814346 0.812236
28 29 0.810127 0.808209
29 30 0.808017 0.808592
In [30]:
# Tuning Minimum Samples required for a split to prevent model overfitting
gini_accuracy2 = []
min_split_range = np.arange(2,101)

for i in min_split_range:
    dt_classifier = DecisionTreeClassifier(criterion='gini',
                                           splitter='best',
                                           max_depth=18,
                                           min_samples_split=i,
                                           random_state=42)
    dt_classifier.fit(x_train, y_train)
    g_acc = dt_classifier.score(x_test, y_test)
    gini_accuracy2.append(g_acc)

plt.figure(figsize=(12,8))
plt.plot(min_split_range, gini_accuracy2, label='gini')
plt.title('Max Depth vs Accuracy')
plt.xlabel('Max Depth')
plt.ylabel('Accuracy')
plt.legend()
plt.tight_layout()
plt.show()
In [31]:
# Comparing Model Accuracy across different Minimum Samples levels
df_pruning2 = pd.DataFrame({'Minimum Splits':min_split_range,
                            'Gini Accuracy':gini_accuracy2})
df_pruning2.head(50)
Out[31]:
Minimum Splits Gini Accuracy
0 2 0.821634
1 3 0.823168
2 4 0.820483
3 5 0.821250
4 6 0.822977
5 7 0.821634
6 8 0.820867
7 9 0.821250
8 10 0.823360
9 11 0.824703
10 12 0.825086
11 13 0.824895
12 14 0.826621
13 15 0.825278
14 16 0.827196
15 17 0.826621
16 18 0.828347
17 19 0.828922
18 20 0.829689
19 21 0.829306
20 22 0.830073
21 23 0.831224
22 24 0.829498
23 25 0.830073
24 26 0.830840
25 27 0.830648
26 28 0.831415
27 29 0.832374
28 30 0.832950
29 31 0.832374
30 32 0.831415
31 33 0.833142
32 34 0.833909
33 35 0.833909
34 36 0.834868
35 37 0.835251
36 38 0.836210
37 39 0.836210
38 40 0.835059
39 41 0.834292
40 42 0.834100
41 43 0.833717
42 44 0.833717
43 45 0.834100
44 46 0.834484
45 47 0.834100
46 48 0.833717
47 49 0.834484
48 50 0.834292
49 51 0.834292
In [32]:
# Building Pruned Decision Tree
dt_pruned = DecisionTreeClassifier(criterion='gini',
                                   splitter='best',
                                   max_depth=18,
                                   min_samples_split=39,
                                   random_state=42)
dt_pruned.fit(x_train, y_train)

print('Accuracy of Fit (Decision Tree with Pruning) =', dt_pruned.score(x_train, y_train))
print('Accuracy of Prediction (Decision Tree with Pruning) =', dt_pruned.score(x_test, y_test), '\n')
classifier_comparison.append(dt_pruned.score(x_test, y_test))

tab3 = pd.crosstab(dt_pruned.predict(x_test), y_test['user_traffic'], margins=True)
tab3.index.name = 'Prediction'
tab3
Accuracy of Fit (Decision Tree with Pruning) = 0.8461981093300452
Accuracy of Prediction (Decision Tree with Pruning) = 0.8362102032988109 

Out[32]:
user_traffic High Low Moderate All
Prediction
High 70 0 69 139
Low 1 2892 273 3166
Moderate 109 402 1398 1909
All 180 3294 1740 5214
In [33]:
# Decision Tree Visualization
target_classes = sorted(list(df_classifier.user_traffic.value_counts().index))

dot_data = export_graphviz(dt_pruned,
                           feature_names=list(df_classifier.columns)[1:],
                           class_names=target_classes,
                           label='all',
                           filled=True,
                           impurity=True,
                           rounded=True,
                           special_characters=True,
                           precision=3)

(graph,) = pydot.graph_from_dot_data(dot_data)

print('Classify \''+list(df_classifier.columns)[0]+'\' with',
      len(df_classifier),
      'classes',
      str(target_classes))

Image(graph.create_png())
Classify 'user_traffic' with 17379 classes ['High', 'Low', 'Moderate']
Out[33]:
In [34]:
# Comparison of Classification Model Accuracy
df_accuracy = pd.DataFrame({'Accuracy':classifier_comparison}, index=['K-NN Classification',
                                                                      'Decision Tree Classification',
                                                                      'Decision Tree Classification (with Pruning)'])
df_accuracy
Out[34]:
Accuracy
K-NN Classification 0.825470
Decision Tree Classification 0.809168
Decision Tree Classification (with Pruning) 0.836210

Clustering Analysis

Clustering Data Preparation

In [35]:
# Data preparation
df_cluster = df_master.copy()
df_cluster.dropna(axis=0, how='any', inplace=True)
df_cluster.reset_index(drop=True, inplace=True)
df_cluster.drop(columns=['temp'], inplace=True)

for col in ['casual','registered']:
    df_cluster[col] = df_cluster[col].astype('int64')

df_cluster['weather_type'] = df_cluster['weather_type'].astype('int64')
df_cluster.drop(columns=['time'], inplace=True)
df_cluster
Out[35]:
season year month day hour is_holiday day_of_week is_working_day weather_type app_temp humidity wind_speed casual registered
0 1 0 1 1 0 0 6 0 1 0.2879 0.81 0.0000 3 13
1 1 0 1 1 1 0 6 0 1 0.2727 0.80 0.0000 8 32
2 1 0 1 1 2 0 6 0 1 0.2727 0.80 0.0000 5 27
3 1 0 1 1 3 0 6 0 1 0.2879 0.75 0.0000 3 10
4 1 0 1 1 4 0 6 0 1 0.2879 0.75 0.0000 0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
17374 1 1 12 31 19 0 1 1 2 0.2576 0.60 0.1642 11 108
17375 1 1 12 31 20 0 1 1 2 0.2576 0.60 0.1642 8 81
17376 1 1 12 31 21 0 1 1 1 0.2576 0.60 0.1642 7 83
17377 1 1 12 31 22 0 1 1 1 0.2727 0.56 0.1343 13 48
17378 1 1 12 31 23 0 1 1 1 0.2727 0.65 0.1343 12 37

17379 rows × 14 columns

In [36]:
# Selecting features for Clustering
x = df_cluster.loc[:,'app_temp':'wind_speed'].copy()
x
Out[36]:
app_temp humidity wind_speed
0 0.2879 0.81 0.0000
1 0.2727 0.80 0.0000
2 0.2727 0.80 0.0000
3 0.2879 0.75 0.0000
4 0.2879 0.75 0.0000
... ... ... ...
17374 0.2576 0.60 0.1642
17375 0.2576 0.60 0.1642
17376 0.2576 0.60 0.1642
17377 0.2727 0.56 0.1343
17378 0.2727 0.65 0.1343

17379 rows × 3 columns

Elbow Method

In [37]:
# Elbow Method for K-Means Clustering
cluster_range = np.arange(1,11)
wcss_list = []

for i in cluster_range:
    k_means = KMeans(n_clusters=i,
                     init='k-means++',
                     copy_x=True,
                     n_jobs=-1,
                     random_state=42)
    k_means.fit(x)
    print('Cluster(s) =',str(i)+',','WCSS =',k_means.inertia_)
    wcss_list.append(k_means.inertia_)

plt.rcParams['figure.figsize'] = 15, 7
plt.plot(cluster_range, wcss_list)
plt.title('Elbow Method for choosing K-Means Clusters', fontweight='bold')
plt.xlabel('Number of Clusters')
plt.ylabel('Within-Cluster Sum of Squares')
plt.show()
Cluster(s) = 1, WCSS = 1420.1570787428068
Cluster(s) = 2, WCSS = 933.6838657101664
Cluster(s) = 3, WCSS = 660.1578078569673
Cluster(s) = 4, WCSS = 551.4199585520407
Cluster(s) = 5, WCSS = 487.0134426350962
Cluster(s) = 6, WCSS = 439.787541999362
Cluster(s) = 7, WCSS = 402.54604954533
Cluster(s) = 8, WCSS = 368.6308448952939
Cluster(s) = 9, WCSS = 341.1699757880593
Cluster(s) = 10, WCSS = 319.7318889442379

K-Means Clustering

In [38]:
# Performing K-Means Clustering with 3 clusters
k_means = KMeans(n_clusters=3,
                 init='k-means++',
                 copy_x=True,
                 n_jobs=-1,
                 random_state=42)
cluster_pred = pd.Series(k_means.fit_predict(x))

cluster_results = pd.DataFrame({'Cluster':cluster_pred.values})
cluster_results = pd.concat([df_cluster,cluster_results], axis=1)

print(cluster_results.Cluster.value_counts())
cluster_results
1    7303
0    5119
2    4957
Name: Cluster, dtype: int64
Out[38]:
season year month day hour is_holiday day_of_week is_working_day weather_type app_temp humidity wind_speed casual registered Cluster
0 1 0 1 1 0 0 6 0 1 0.2879 0.81 0.0000 3 13 1
1 1 0 1 1 1 0 6 0 1 0.2727 0.80 0.0000 8 32 1
2 1 0 1 1 2 0 6 0 1 0.2727 0.80 0.0000 5 27 1
3 1 0 1 1 3 0 6 0 1 0.2879 0.75 0.0000 3 10 1
4 1 0 1 1 4 0 6 0 1 0.2879 0.75 0.0000 0 1 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
17374 1 1 12 31 19 0 1 1 2 0.2576 0.60 0.1642 11 108 2
17375 1 1 12 31 20 0 1 1 2 0.2576 0.60 0.1642 8 81 2
17376 1 1 12 31 21 0 1 1 1 0.2576 0.60 0.1642 7 83 2
17377 1 1 12 31 22 0 1 1 1 0.2727 0.56 0.1343 13 48 2
17378 1 1 12 31 23 0 1 1 1 0.2727 0.65 0.1343 12 37 2

17379 rows × 15 columns

Clusters Visualization

In [39]:
# Cluster Distribution
cluster_dist = cluster_results.groupby(by=['Cluster'], as_index=False)
cluster_dist = cluster_dist[['casual','registered']].sum()
cluster_dist
Out[39]:
Cluster casual registered
0 0 378924 1221008
1 1 131529 809572
2 2 109564 642082
In [40]:
# Plotting Casual and Registered user distribution across different clusters
plt.rcParams['figure.figsize'] = 12, 8
cluster_dist.iloc[:,1:].plot.bar()
plt.title('Casual and Registered user Population across different Clusters')
plt.xticks(rotation=0)
plt.xlabel('Cluster')
plt.ylabel('User Count')
plt.tight_layout()
plt.show()
In [41]:
# Full Visualization of Cluster Features of Casual and Registered users
f, axes = plt.subplots(6, 6, figsize=(36, 32))

for k in range(3):
    temp = cluster_results[cluster_results['Cluster']==k]

    for i, col in enumerate(['season']+list(temp.columns)[4:9]):
        df_grouped = temp.groupby(by=[col], as_index=False)
        df_grouped = df_grouped[['casual','registered']].sum()
        
        axes[i,2*k].bar(x=df_grouped[col].tolist(), 
                    height=df_grouped['casual'].to_list(),
                    color=cm.tab20.colors[2*i+1])
        y_min = int(math.floor(np.min(df_grouped['casual'].to_list())/10000))*10000
        y_max = int(math.ceil(np.max(df_grouped['casual'].to_list())/10000))*10000
        axes[i,2*k].set_ylabel('User Count')
        axes[i,2*k].set_ylim([y_min, y_max])
        axes[i,2*k].set_title('\''+col+'\''+' (Cluster '+str(k+1)+', Casual)',
                              fontsize=18)
        
        axes[i,2*k+1].bar(x=df_grouped[col].tolist(), 
                    height=df_grouped['registered'].to_list(),
                    color=cm.tab20.colors[2*i])
        y_min = int(math.floor(np.min(df_grouped['registered'].to_list())/10000))*10000
        y_max = int(math.ceil(np.max(df_grouped['registered'].to_list())/10000))*10000
        axes[i,2*k+1].set_ylim([y_min, y_max])
        axes[i,2*k+1].set_title('\''+col+'\''+' (Cluster '+str(k+1)+', Registered)',
                                fontsize=18)

plt.subplots_adjust(wspace=0.05,hspace=0.05)
plt.tight_layout()
plt.show()
In [42]:
print('Total Notebook Runtime:',round((time.time()-master_tm)/60,2),'mins')
Total Notebook Runtime: 43.51 mins